Question: how does differential localization matches with gene expression?
That is, for the different AID depletions. Is there any link between the two?
I will use the processed counts from NQ and perform simple differential analyses.
Set the parameters and list the data.
# Load dependencies
library(tidyverse)
library(GenomicRanges)
library(rtracklayer)
library(ggplot2)
library(ggbeeswarm)
library(DESeq2)
library(RColorBrewer)
library(pheatmap)
library(GGally)
library(ggrastr)
# Prepare output
output_dir <- "ts220113_GeneExpression"
dir.create(output_dir, showWarnings = FALSE)library(knitr)
opts_chunk$set(fig.width = 10, fig.height = 4, dev=c('png', 'pdf'),
message = F, warning = F,
fig.path = file.path(output_dir, "figures/"))
pdf.options(useDingbats = FALSE)DifferentialAnalysis <- function(dds, contrast, exp, control, genes_results,
lfcThreshold = 0.5, alpha = 0.05) {
main = paste(exp, "vs", control, sep = "_")
# Run DESeq2 results
res <- results(dds, lfcThreshold = lfcThreshold,
contrast = c(contrast, exp, control))
# Prepare MAplot
tib <- as_tibble(res) %>%
drop_na() %>%
mutate(shape = case_when(log2FoldChange > 8 ~ 2,
log2FoldChange < -8 ~ 2,
T ~ 1),
log2FoldChange = case_when(log2FoldChange > 8 ~ 8,
log2FoldChange < -8 ~ -8,
T ~ log2FoldChange))
# Prepare plot
plt <- tib %>%
ggplot(aes(x = baseMean, y = log2FoldChange)) +
geom_bin2d(data = tib[tib$padj >= 0.05, ], bins = 100) +
xlab("Mean expression (cpm)") +
ylab("Expression difference (log2)") +
ggtitle(main) +
ylim(-8, 8) +
scale_x_log10() +
scale_color_manual(values = c("red"), name = "Significant") +
scale_shape_discrete(guide = FALSE) +
scale_fill_gradient(low = "lightgrey", high = "black", name = "Count") +
theme_bw() +
theme(aspect.ratio = 1)
if (any(tib$padj < 0.05)) {
plt <- plt +
geom_point(data = tib[tib$padj < 0.05, ], aes(col = T,
shape = factor(shape)),
size = 1, show.legend = T)
}
plot(plt)
# Add results to genes_results
mcols(genes_results)[, paste0(main, c("_baseMean", "_log2FoldChange", "_padj"))] <-
res[, c("baseMean", "log2FoldChange", "padj")]
# Add differential results
mcols(genes_results)[, paste0(main, "_sign")] <- "stable"
mcols(genes_results)[which(res$log2FoldChange > 0 & res$padj < 0.05),
paste0(main, "_sign")] <- "up"
mcols(genes_results)[which(res$log2FoldChange < 0 & res$padj < 0.05),
paste0(main, "_sign")] <- "down"
genes_results
}Load data from NQ. These are simply gene counts.
# Load genes from NQ - filter for gene entries
genes <- import("Data_NQ/RNAseq/Mus_musculus.GRCm38.92_withChr.gtf")
genes <- genes[genes$type == "gene"]
# Load counts from NQ
#tib_wapl_cw <- read_tsv("Data_NQ/RNAseq/Wapl_CtcfWapl_RNA_htseqcounts_unnormalized.txt")
#tib_rad <- read_tsv("Data_NQ/RNAseq/Rad21_RNA_htseqcounts_unnormalized.txt")
tib_counts <- read_tsv("Data_NQ/RNAseq_all/RNA_htseqcounts_unnormalized.txt")
# Prepare metadata
metadata <- tibble(file_name = names(tib_counts)) %>%
filter(file_name != "ensembl_id") %>%
rowwise() %>%
mutate(target = case_when(grepl("CTCF-EN", file_name) ~ "CTCFEL",
grepl("CTCF-NQL", file_name) ~ "CTCFNQ",
grepl("CTCFWAPL", file_name) ~ "CTCFWAPL",
grepl("WAPL", file_name) ~ "WAPL",
grepl("RAD21", file_name) ~ "RAD21",
T ~ "WT"),
replicate = case_when(grepl("rep2", file_name) ~ "r2",
T ~ "r1"),
timepoint = case_when(grepl("96h", file_name) ~ "96h",
grepl("48h", file_name) ~ "48h",
grepl("24h", file_name) ~ "24h",
grepl("6h", file_name) ~ "6h",
T ~ "0h"),
clone = case_when(grepl("C23", file_name) ~ "C23",
grepl("C6", file_name) ~ "C6",
T ~ "X")) %>%
# Manual fix of sample mixing
# Note that we confirmed that this is the case with mCherry / GFP reads that
# are specific for both cell lines
mutate(target = case_when(replicate == "r2" & target == "CTCFEL" ~ "CTCFNQ",
replicate == "r2" & target == "CTCFNQ" ~ "CTCFEL",
T ~ target)) %>%
ungroup() %>%
mutate(new_name = paste(target, timepoint, replicate,
sep = "_"),
target_timepoint = factor(paste(target, timepoint,
sep = "_"))) %>%
mutate(target = factor(target,
levels = c("WT",
"CTCFEL", "CTCFNQ",
"RAD21",
"WAPL", "CTCFWAPL")),
timepoint = factor(timepoint,
levels = c("0h", "6h", "24h",
"48h", "96h")),
replicate = factor(replicate,
levels = c("r1", "r2")))
# Combine counts and rename
tib_counts <- tib_counts %>%
rename_all(~ c("ensembl_id", metadata$new_name)) %>%
filter(! str_detect(ensembl_id, "__")) %>%
right_join(tibble(ensembl_id = genes$gene_id))
# Re-order counts
tib_counts <- tib_counts[match(genes$gene_id, tib_counts$ensembl_id), ]
# Export as table for GEO submission
write_tsv(tib_counts %>%
rename_all(function(x) str_replace(x, "WT", "PT")) %>%
# Order columns
dplyr::select(ensembl_id,
contains("PT"),
contains("CTCFEL_"),
contains("RAD21"),
starts_with("WAPL"),
contains("CTCFWAPL")) %>%
rename_all(function(x) str_replace(x, "CTCFEL", "CTCF")),
file = file.path(output_dir, "rnaseq_gene_counts.tsv"))
# Only work with protein_coding genes and "normal" chromosomes
idx <- which(genes$gene_biotype %in% c("protein_coding") &
seqnames(genes) %in% c(paste0("chr", 1:19), "chrX"))
genes <- genes[idx]
tib_counts <- tib_counts[idx, ]Load counts into DESeq2.
set.seed(1)
# Convert tibble to named data.frame
df_counts <- as.data.frame(tib_counts %>%
dplyr::select(-ensembl_id))
row.names(df_counts) <- tib_counts$ensembl_id
# Load this into DESeq2
dds <- DESeqDataSetFromMatrix(countData = df_counts,
colData = data.frame(metadata),
design= ~ target_timepoint)
dds <- DESeq(dds)
# Get the "transformed" values and create PCA plot
dds_vsd <- vst(dds, blind = FALSE)
dds_vsd <- normTransform(dds, pc = 0.001)
pca <- plotPCA(dds_vsd, intgroup = c("target"), ntop = 5000,
returnData = F)
# Add metadata and plot
as_tibble(pca$data) %>%
bind_cols(metadata %>% dplyr::select(-target)) %>%
ggplot(aes(x = PC1, y = PC2, col = timepoint, shape = target)) +
geom_point(size = 3) +
xlab(pca$labels$x) +
ylab(pca$labels$y) +
theme_bw() +
theme(aspect.ratio = 1)as_tibble(pca$data) %>%
bind_cols(metadata %>% dplyr::select(-target)) %>%
filter(target != "CTCFNQ") %>%
ggplot(aes(x = PC1, y = PC2, col = timepoint, shape = target)) +
geom_point(size = 3) +
xlab(pca$labels$x) +
ylab(pca$labels$y) +
theme_bw() +
theme(aspect.ratio = 1)Okay, now we have the RNAseq samples (gene counts) loaded the PCA plot confirms that the samples are not too bad.
I want FPKM values to use as measure of gene expression. Get these.
To do this, I need to determine “gene length” for all the genes. Of course, RNA-seq only captures exons, so I need to gather the length of only exons.
# Call to determine gene length:
#python /home/t.v.schaik/mydata/proj/sManzo_pADamID/ts190515_pADamID_RPE_Top1_DRB/bin/gtftools.py -l Data_NQ/Mus_musculus.GRCm38.92_withChr_geneLength.bed Data_NQ/Mus_musculus.GRCm38.92_withChr.gtf
# Read gene lengths
tib_geneLength <- read_tsv("Data_NQ/RNAseq/Mus_musculus.GRCm38.92_withChr_geneLength.bed") %>%
dplyr::rename(gene_id = "gene")
# Add to genes
tib_geneLength <- tibble(gene_id = genes$gene_id) %>%
left_join(tib_geneLength)
genes$length <- tib_geneLength$mean
# Add genelength
mcols(dds)$basepairs <- genes$length
# Get FPKM
tib_fpkm <- as_tibble(fpkm(dds)) %>%
add_column(ensembl_id = tib_counts$ensembl_id)
# Get mean FPKM per sample
tmp <- do.call(cbind, tapply(metadata$new_name,
interaction(metadata$target,
metadata$timepoint),
function(x) {
if (length(x) > 1) {
rowMeans(tib_fpkm[, x])
} else {
tib_fpkm[, x]
}}))
tib_fpkm_mean <- tib_fpkm %>%
dplyr::select(ensembl_id) %>%
bind_cols(as_tibble(tmp) %>%
rename_all(function(x) gsub("\\.", "_", x)))
# Save as RDS
saveRDS(tib_fpkm,
file.path(output_dir,
"genes_fpkm.rds"))
saveRDS(tib_fpkm_mean,
file.path(output_dir,
"genes_fpkm_mean.rds"))
saveRDS(genes,
file.path(output_dir,
"genes.rds"))
# Also, write bigwig files of the FPKM
bigwig_dir <- file.path(output_dir,
"bigwig")
dir.create(bigwig_dir, showWarnings = F)
genes_ranges <- genes
mcols(genes_ranges) <- NULL
start(genes_ranges) <- end(genes_ranges) <- ifelse(strand(genes_ranges) == "+",
start(genes_ranges),
end(genes_ranges))
# Add seqinfo
chrom_info <- read.table("~/mydata/data/genomes/mm10/mm10.chrom.sizes",
sep = "\t", stringsAsFactors = F)
row.names(chrom_info) <- chrom_info[, 1]
seqlengths(genes_ranges) <- chrom_info[seqlevels(genes_ranges), 2]
# Export bigwigs
for (n in names(tib_fpkm_mean)) {
# Not for the gene ID of course
if (n == "ensembl_id") next
# Temporary ranges
genes_tmp <- genes_ranges
genes_tmp$score <- tib_fpkm_mean %>% pull(n)
# Remove duplicates ranges
genes_tmp <- genes_tmp[! duplicated(paste0(seqnames(genes_tmp),
start(genes_tmp)))]
# Remove NAs
genes_tmp <- genes_tmp[! is.na(genes_tmp$score)]
# Remove seqlevels without seqlenghts
genes_tmp <- genes_tmp[seqnames(genes_tmp) %in% seqlevels(genes_tmp)[! is.na(seqlengths(genes_tmp))]]
seqlevels(genes_tmp) <- as.character(unique(seqnames(genes_tmp)))
# Export as bigwig
export.bw(genes_tmp, file.path(bigwig_dir,
paste0(n, ".bw")))
}I want gene lists to play with. Up and downregulated genes versus stable genes. Let’s do some simple one-to-one differential tests.
# Prepare genes object with output of differential analyses
genes_results <- genes
mcols(genes_results) <- NULL
# Differential analysis - make MAplot and save results
# 1) WT timecourse
genes_results <- DifferentialAnalysis(dds, "target_timepoint",
"WT_96h", "WT_0h",
genes_results)# 2) AID versus WT
genes_results <- DifferentialAnalysis(dds, "target_timepoint",
"CTCFEL_0h", "WT_0h",
genes_results)genes_results <- DifferentialAnalysis(dds, "target_timepoint",
"CTCFNQ_0h", "WT_0h",
genes_results)genes_results <- DifferentialAnalysis(dds, "target_timepoint",
"WAPL_0h", "WT_0h",
genes_results)genes_results <- DifferentialAnalysis(dds, "target_timepoint",
"CTCFWAPL_0h", "WT_0h",
genes_results)genes_results <- DifferentialAnalysis(dds, "target_timepoint",
"RAD21_0h", "WT_0h",
genes_results)# 3) CTCF timecourse
# 3a) CTCF-EL
genes_results <- DifferentialAnalysis(dds, "target_timepoint",
"CTCFEL_6h", "CTCFEL_0h",
genes_results)genes_results <- DifferentialAnalysis(dds, "target_timepoint",
"CTCFEL_24h", "CTCFEL_0h",
genes_results)genes_results <- DifferentialAnalysis(dds, "target_timepoint",
"CTCFEL_96h", "CTCFEL_0h",
genes_results)# 3b) CTCF-NQ
genes_results <- DifferentialAnalysis(dds, "target_timepoint",
"CTCFNQ_6h", "CTCFNQ_0h",
genes_results)genes_results <- DifferentialAnalysis(dds, "target_timepoint",
"CTCFNQ_24h", "CTCFNQ_0h",
genes_results)genes_results <- DifferentialAnalysis(dds, "target_timepoint",
"CTCFNQ_96h", "CTCFNQ_0h",
genes_results)# 4) WAPL timecourse
genes_results <- DifferentialAnalysis(dds, "target_timepoint",
"WAPL_6h", "WAPL_0h",
genes_results)genes_results <- DifferentialAnalysis(dds, "target_timepoint",
"WAPL_24h", "WAPL_0h",
genes_results)genes_results <- DifferentialAnalysis(dds, "target_timepoint",
"WAPL_48h", "WAPL_0h",
genes_results)genes_results <- DifferentialAnalysis(dds, "target_timepoint",
"WAPL_96h", "WAPL_0h",
genes_results)# 5) CTCF-WAPL timecourse
genes_results <- DifferentialAnalysis(dds, "target_timepoint",
"CTCFWAPL_6h", "CTCFWAPL_0h",
genes_results)genes_results <- DifferentialAnalysis(dds, "target_timepoint",
"CTCFWAPL_24h", "CTCFWAPL_0h",
genes_results)genes_results <- DifferentialAnalysis(dds, "target_timepoint",
"CTCFWAPL_48h", "CTCFWAPL_0h",
genes_results)genes_results <- DifferentialAnalysis(dds, "target_timepoint",
"CTCFWAPL_96h", "CTCFWAPL_0h",
genes_results)# 6) RAD21 timecourse
genes_results <- DifferentialAnalysis(dds, "target_timepoint",
"RAD21_6h", "RAD21_0h",
genes_results)genes_results <- DifferentialAnalysis(dds, "target_timepoint",
"RAD21_24h", "RAD21_0h",
genes_results)# Save as rds
saveRDS(genes_results,
file.path(output_dir,
"genes_results.rds"))Which genes are differentially expressed? First, look at the expression levels of the up and down-regulated genes.
tib <- as_tibble(mcols(genes_results)) %>%
dplyr::select(WT_96h_vs_WT_0h_baseMean,
contains("sign")) %>%
gather(key, value, -contains("baseMean")) %>%
mutate(key = str_remove(key, "_sign"),
key = factor(key, levels = unique(key)),
value = factor(value, levels = c("down", "stable", "up"))) %>%
separate(key, c("condition", "timepoint"), remove = F) %>%
filter(condition != "WT" & ! timepoint %in% c("0h", "6h")) %>%
mutate(condition = factor(condition, levels = levels(metadata$target)),
timepoint = factor(timepoint, levels = levels(metadata$timepoint)))
plt <- tib %>%
ggplot(aes(x = WT_96h_vs_WT_0h_baseMean+1, col = value, fill = value)) +
geom_density(alpha = 0.2) +
scale_x_log10() +
facet_grid(condition ~ timepoint, scales = "free_y") +
scale_color_manual(values = c(c("blue", "darkgrey", "red"))) +
scale_fill_manual(values = c(c("blue", "darkgrey", "red"))) +
xlab("Wildtype expression (cpm)") +
ylab("Density") +
theme_bw() +
theme(aspect.ratio = 1)
plot(plt)As expected, upregulated are generally more lowly expressed in the wildtype, and vice-versa for downregulated genes.
Let’s make some sort of overview table with all the differential results. Similar as Nora et al., made but then with all the samples.
# Create one table with all the differential analysis results
tib <- as_tibble(mcols(genes_results)) %>%
dplyr::select(contains("sign"),
-matches("0h.*0h"),
-contains("NQ"))
# dplyr::select(contains("sign"),
# -matches("0h.*0h"))
gene_idx <- rowSums(tib != "stable") > 0
tib <- tib %>%
filter(gene_idx) %>%
rename_all(function(x) str_remove(x, "_sign")) %>%
rename_all(function(x) str_remove(x, "_vs.*")) %>%
mutate_all(function(x) {
x = case_when(x == "up" ~ 1,
x == "down" ~ -1,
T ~ 0)
})
# Repeat for log2
tib_log2 <- as_tibble(mcols(genes_results)) %>%
dplyr::select(contains("log2"),
-matches("0h.*0h"),
-contains("NQ")) %>%
# dplyr::select(contains("log2"),
# -matches("0h.*0h")) %>%
filter(gene_idx) %>%
rename_all(function(x) str_remove(x, "_log.*")) %>%
rename_all(function(x) str_remove(x, "_vs.*"))
# Limit extreme values to 0.99 percentiles
quantiles <- quantile(unlist(tib_log2), c(0.025, 0.975))
tib_log2_cutoff <- tib_log2 %>%
mutate_all(function(x) {
x = case_when(x > quantiles[2] ~ quantiles[2],
x < quantiles[1] ~ quantiles[1],
T ~ x)
})
# Plot
# 1) Significant calls
# 1a) Prepare clusters
hclust_gene <- hclust(dist(tib), method = "complete")
# 1b) Column groups
my_sample_col <- data.frame(class = c("WT_timecourse",
#rep("AID_vs_WT", 5),
rep("CTCFEL_timecourse", 3),
#rep("CTCFNQ_timecourse", 3),
rep("WAPL_timecourse", 4),
rep("CTCFWAPL_timecourse", 4),
rep("RAD21_timecourse", 2)),
target = c("WT",
#c("CTCFEL", "CTCFNQ",
# "WAPL", "CTCFWAPL", "RAD21"),
rep("CTCFEL", 3),
#rep("CTCFNQ", 3),
rep("WAPL", 4),
rep("CTCFWAPL", 4),
rep("RAD21", 2)))
my_sample_col$target <- factor(my_sample_col$target,
levels = c("WT", "CTCFEL", "RAD21",
"WAPL", "CTCFWAPL"))
my_sample_col$class <- factor(my_sample_col$class,
levels = paste0(levels(my_sample_col$target),
"_timecourse"))
row.names(my_sample_col) <- my_sample_col$name <- names(tib)
# Change order of objects
idx <- c(1:4, 13:14, 5:12)
tib <- tib[, idx]
tib_log2_cutoff <- tib_log2_cutoff[, idx]
my_sample_col <- my_sample_col[idx, ]
# # 1c) heatmap
# pheatmap(as.matrix(tib)[hclust_gene$order, ],
# cluster_cols = F, cluster_rows = F,
# show_rownames = F, annotation_col = my_sample_col)
#
# # 2) Log2FoldChanges
# pheatmap(as.matrix(tib_log2_cutoff)[hclust_gene$order, ],
# cluster_cols = F, cluster_rows = F,
# show_rownames = F, annotation_col = my_sample_col)
# Plot with ggplot
tib[hclust_gene$order, ] %>%
mutate(gene = 1:nrow(.)) %>%
gather(key, value, -gene) %>%
mutate(key = factor(key, levels = my_sample_col$name)) %>%
ggplot(aes(x = key, y = -gene, fill = value)) +
rasterize(geom_tile(),
dpi = 300) +
scale_fill_distiller(palette = "RdYlBu") +
#scale_fill_gradient2(low = "#4575B4", mid = "#FFFFBF", high = "#D73027") +
theme_classic() +
theme(aspect.ratio = 2,
axis.text.x = element_text(angle = 90, hjust = 1))tib_log2_cutoff[hclust_gene$order, ] %>%
mutate(gene = 1:nrow(.)) %>%
gather(key, value, -gene) %>%
mutate(key = factor(key, levels = my_sample_col$name)) %>%
ggplot(aes(x = key, y = -gene, fill = value)) +
rasterize(geom_tile(),
dpi = 300) +
scale_fill_distiller(palette = "RdYlBu", limits = c(-3.75, 3.75)) +
#scale_fill_gradient2(low = "#4575B4", mid = "#FFFFBF", high = "#D73027") +
theme_classic() +
theme(aspect.ratio = 2,
axis.text.x = element_text(angle = 90, hjust = 1))How many genes are overlapping? Can I confirm that the observed overlap is more than you would expect by chance?
To answer this, I will determine the (expected) random overlap and the actual overlap of the affected genes. Note that the sign (up / down) is taken into account. If the genes are affected in opposite orientation, this is not seen as overlap.
# Create one table with all the differential analysis results
tib <- as_tibble(mcols(genes_results)) %>%
dplyr::select(contains("sign"),
-matches("0h.*0h"),
-contains("NQ"))
# dplyr::select(contains("sign"),
# -matches("0h.*0h"))
tib <- tib %>%
rename_all(function(x) str_remove(x, "_sign")) %>%
rename_all(function(x) str_remove(x, "_vs.*")) %>%
mutate_all(function(x) {
x = case_when(x == "up" ~ 1,
x == "down" ~ -1,
T ~ 0)
})
# Only select active genes
idx_active <- tib_fpkm_mean %>%
dplyr::select(-ensembl_id,
-contains("NQ")) %>%
mutate(n_active = rowSums(. > 1),
idx_active = n_active >= 1) %>%
pull(idx_active)
tib <- tib[idx_active, ]
# Focus on 96h - most extreme effects
tib_endpoint <- tib %>%
dplyr::select(# WT_96h,
CTCFEL_96h,
RAD21_24h,
WAPL_96h,
CTCFWAPL_96h)
# Determine expected overlap
# 1) how many differentially expressed genes (in this object)
percentage_up <- tib_endpoint %>%
dplyr::summarise_all(function(x) mean(x == 1))
percentage_down <- tib_endpoint %>%
dplyr::summarise_all(function(x) mean(x == -1))
# 2) expected overlap
up_expected <- as_tibble(expand.grid(t(percentage_up)[, 1],
t(percentage_up)[, 1])) %>%
mutate(sample1 = rep(names(percentage_up), 4),
sample2 = rep(names(percentage_up), each = 4),
direction = "up") %>%
filter(sample1 != sample2)
down_expected <- as_tibble(expand.grid(t(percentage_down)[, 1],
t(percentage_down)[, 1])) %>%
mutate(sample1 = rep(names(percentage_down), 4),
sample2 = rep(names(percentage_down), each = 4),
direction = "down") %>%
filter(sample1 != sample2)
overlap_expected <- bind_rows(up_expected,
down_expected) %>%
mutate(expected_fraction = Var1 * Var2) %>%
group_by(sample1, sample2) %>%
dplyr::summarise(expected_fraction = sum(expected_fraction)) %>%
mutate(expected_number = expected_fraction * nrow(tib))
# 3) actual overlap
calc_gene_overlap <- function(s1, s2) {
# Get the values
x1 <- tib %>% pull(s1)
x2 <- tib %>% pull(s2)
# Determine the amount of overlap
up <- sum(x1 == 1 & x2 == 1)
down <- sum(x1 == -1 & x2 == -1)
# Return combined
sum(up, down)
}
overlap_expected <- overlap_expected %>%
rowwise() %>%
mutate(actual_number = calc_gene_overlap(sample1, sample2)) %>%
ungroup() %>%
mutate(actual_fraction = actual_number / nrow(tib))
# 4) enrichment
expr_levels <- names(tib_endpoint)
expr_levels_new <- paste0(expr_levels,
" (",
colSums(tib_endpoint != 0),
")")
overlap_expected <- overlap_expected %>%
mutate(enrichment = log2(actual_fraction / expected_fraction)) %>%
mutate(sample1 = factor(sample1,
levels = names(percentage_up)),
sample2 = factor(sample2, levels = names(percentage_up)))
levels(overlap_expected$sample1) <- levels(overlap_expected$sample2) <-
expr_levels_new
# 5) plot
overlap_expected %>%
ggplot(aes(x = sample1, y = sample2, fill = expected_number,
label = round(expected_number, 0))) +
geom_tile() +
geom_label(label.size = NA, size = 3) +
xlab("") +
ylab("") +
scale_fill_gradient(low = "white", high = "red",
limits = c(0, 1500)) +
theme_bw() +
theme(aspect.ratio = 1,
axis.text.x = element_text(angle = 90, hjust = 1))overlap_expected %>%
ggplot(aes(x = sample1, y = sample2, fill = actual_number,
label = round(actual_number, 0))) +
geom_tile() +
geom_label(label.size = NA, size = 3) +
xlab("") +
ylab("") +
scale_fill_gradient(low = "white", high = "red",
limits = c(0, 1500)) +
theme_bw() +
theme(aspect.ratio = 1,
axis.text.x = element_text(angle = 90, hjust = 1))overlap_expected %>%
ggplot(aes(x = sample1, y = sample2, fill = enrichment,
label = actual_number)) +
geom_tile() +
geom_label(label.size = NA, size = 3) +
xlab("") +
ylab("") +
scale_fill_gradient2(low = "blue", mid = "white", high = "red",
midpoint = 0, limits = c(-3.5, 3.5)) +
theme_bw() +
theme(aspect.ratio = 1,
axis.text.x = element_text(angle = 90, hjust = 1))overlap_expected %>%
ggplot(aes(x = sample1, y = sample2, fill = enrichment,
label = round(enrichment, 2))) +
geom_tile() +
geom_label(label.size = NA, size = 3) +
xlab("") +
ylab("") +
scale_fill_gradient2(low = "blue", mid = "white", high = "red",
midpoint = 0, limits = c(-3.5, 3.5)) +
theme_bw() +
theme(aspect.ratio = 1,
axis.text.x = element_text(angle = 90, hjust = 1))Yes, there is a clear enrichment. I could also show this in a correlation matrix of the affected genes.
# From Fede:
# ggpairs custom functions
corColor <- function(data, mapping, color = I("black"), sizeRange = c(1, 3), ...) {
x <- eval_data_col(data, mapping$x)
y <- eval_data_col(data, mapping$y)
r <- cor(x, y)
rt <- format(r, digits = 3)
tt <- as.character(rt)
cex <- max(sizeRange)
# helper function to calculate a useable size
percent_of_range <- function(percent, range) {
percent * diff(range) + min(range, na.rm = TRUE)
}
# plot correlation coefficient
p <- ggally_text(label = tt, mapping = aes(), xP = 0.5, yP = 0.5,
# size = I(percent_of_range(cex * abs(r), sizeRange)),
size = 6,
color = color, ...) +
theme(panel.grid.minor=element_blank(),
panel.grid.major=element_blank())
corColors <- RColorBrewer::brewer.pal(n = 7, name = "RdYlBu")[2:6]
if (r <= boundaries[1]) {
corCol <- corColors[1]
} else if (r <= boundaries[2]) {
corCol <- corColors[2]
} else if (r < boundaries[3]) {
corCol <- corColors[3]
} else if (r < boundaries[4]) {
corCol <- corColors[4]
} else {
corCol <- corColors[5]
}
p <- p +
theme(panel.background = element_rect(fill = corCol))
return(p)
}
customScatter <- function (data, mapping)
{
p <- ggplot(data = data, mapping) +
geom_point(alpha = 0.2, size = 0.5) +
geom_smooth(method = "lm", se = T, col = "red") +
theme_bw()
p
}
# Use GGally to make correlation plots
boundaries <- seq(from = 0.1, to = 0.7, length.out = 4)
plt <- ggpairs(tib_log2 %>% drop_na(),
upper = list(continuous = corColor),
lower = list(continuous = customScatter),
diag = list(continuous = function(data, mapping, ...) {
ggally_densityDiag(data = data, mapping = mapping, alpha = 0.3, fill = "red") +
theme_bw()})) +
ggtitle("Correlating differential results") +
xlab("") +
ylab("")
print(plt)Let’s save bed files of the significant genes.
# Bed file dir
diff_dir <- file.path(output_dir, "differential_analysis")
dir.create(diff_dir, showWarnings = F)
# Get the first base of every gene
genes_start <- genes_results
mcols(genes_start) <- NULL
start(genes_start) <- end(genes_start) <- ifelse(strand(genes_start) == "+",
start(genes_start),
end(genes_start))
# Loop over comparisons
for (n in names(mcols(genes_results))) {
if (! grepl("sign", n)) next
# Get the results
n_res <- mcols(genes_results)[, n]
# Save differential genes
export.bed(genes_start[n_res == "up"],
file.path(diff_dir,
paste0(n, "_up.bed")))
export.bed(genes_start[n_res == "down"],
file.path(diff_dir,
paste0(n, "_down.bed")))
# Save gene lists
write_tsv(tibble(gene = genes$gene_id[n_res == "up"]), col_names = F,
file.path(diff_dir, paste0(n, "_up_list.txt")))
write_tsv(tibble(gene = genes$gene_id[n_res == "down"]), col_names = F,
file.path(diff_dir, paste0(n, "_down_list.txt")))
}
write_tsv(tibble(gene = genes$gene_id), col_names = F,
file.path(diff_dir, "genes_all_list.txt"))All depletions affect a partially overlapping set of genes. I saved these results to compare with other data in different documents.
sessionInfo()## R version 4.0.5 (2021-03-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.3 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/liblapack.so.3
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] knitr_1.33 ggrastr_1.0.0
## [3] GGally_2.1.2 pheatmap_1.0.12
## [5] RColorBrewer_1.1-2 DESeq2_1.30.1
## [7] SummarizedExperiment_1.20.0 Biobase_2.50.0
## [9] MatrixGenerics_1.2.1 matrixStats_0.60.0
## [11] ggbeeswarm_0.6.0 rtracklayer_1.50.0
## [13] GenomicRanges_1.42.0 GenomeInfoDb_1.26.7
## [15] IRanges_2.24.1 S4Vectors_0.28.1
## [17] BiocGenerics_0.36.1 forcats_0.5.1
## [19] stringr_1.4.0 dplyr_1.0.7
## [21] purrr_0.3.4 readr_2.0.0
## [23] tidyr_1.1.3 tibble_3.1.6
## [25] ggplot2_3.3.5 tidyverse_1.3.1
##
## loaded via a namespace (and not attached):
## [1] colorspace_2.0-2 ellipsis_0.3.2 XVector_0.30.0
## [4] fs_1.5.0 rstudioapi_0.13 farver_2.1.0
## [7] bit64_4.0.5 AnnotationDbi_1.52.0 fansi_0.5.0
## [10] lubridate_1.7.10 xml2_1.3.2 codetools_0.2-18
## [13] splines_4.0.5 cachem_1.0.5 geneplotter_1.68.0
## [16] jsonlite_1.7.2 Cairo_1.5-12.2 Rsamtools_2.6.0
## [19] broom_0.7.9 annotate_1.68.0 dbplyr_2.1.1
## [22] compiler_4.0.5 httr_1.4.2 backports_1.2.1
## [25] assertthat_0.2.1 Matrix_1.3-4 fastmap_1.1.0
## [28] cli_3.1.0 htmltools_0.5.1.1 tools_4.0.5
## [31] gtable_0.3.0 glue_1.5.0 GenomeInfoDbData_1.2.4
## [34] Rcpp_1.0.7 cellranger_1.1.0 jquerylib_0.1.4
## [37] vctrs_0.3.8 Biostrings_2.58.0 xfun_0.24
## [40] rvest_1.0.1 lifecycle_1.0.1 XML_3.99-0.6
## [43] zlibbioc_1.36.0 scales_1.1.1 vroom_1.5.3
## [46] hms_1.1.0 yaml_2.2.1 memoise_2.0.0
## [49] sass_0.4.0 reshape_0.8.8 stringi_1.7.3
## [52] RSQLite_2.2.7 highr_0.9 genefilter_1.72.1
## [55] BiocParallel_1.24.1 rlang_0.4.12 pkgconfig_2.0.3
## [58] bitops_1.0-7 evaluate_0.14 lattice_0.20-44
## [61] labeling_0.4.2 GenomicAlignments_1.26.0 bit_4.0.4
## [64] tidyselect_1.1.1 plyr_1.8.6 magrittr_2.0.1
## [67] R6_2.5.1 generics_0.1.0 DelayedArray_0.16.3
## [70] DBI_1.1.1 pillar_1.6.4 haven_2.4.1
## [73] withr_2.4.2 survival_3.2-11 RCurl_1.98-1.3
## [76] modelr_0.1.8 crayon_1.4.2 utf8_1.2.2
## [79] tzdb_0.1.2 rmarkdown_2.11 locfit_1.5-9.4
## [82] grid_4.0.5 readxl_1.3.1 blob_1.2.2
## [85] reprex_2.0.0 digest_0.6.28 xtable_1.8-4
## [88] munsell_0.5.0 beeswarm_0.4.0 vipor_0.4.5
## [91] bslib_0.2.5.1